
#get distances
data_dist<-data_student_raw %>%
  ungroup() %>%
  filter(an_adm %in% 2004:2015) %>%
  filter(liceu_repartizat!='' & !is.na(liceu_repartizat)) %>%
  filter(!is.na(dist)) %>%
  mutate(id_adm=row_number())

#get cutoffs
data_cutoffs<-data_dist %>%
  group_by(an_adm,judet_adm,liceu_repartizat,specializare_adm,specializare_lb) %>%
  summarise(cutoff=min(entrance_perc,na.rm=T),mean_e=mean(entrance_perc,na.rm=T))

#create potential options for each stduent  
data_option<-data_dist %>%
  filter(dist<=10000) %>%
  group_by(judet_adm,an_adm,scoala_de_provenienta,liceu_repartizat,dist) %>%
  summarise() 

#merge in cutoffs and optionsd
data_options<-data_dist %>%
  filter(dist<=10000) %>%
  dplyr::select(id_adm,an_adm,judet_adm,liceu_repartizat,scoala_de_provenienta,dist,entrance_perc,specializare_adm,specializare_lb) %>%
  left_join(data_option,by=c("an_adm","scoala_de_provenienta"),suffix=c("","_option")) %>%
  left_join(data_cutoffs,by=c("judet_adm","an_adm","liceu_repartizat","specializare_adm","specializare_lb")) %>%
  left_join(data_cutoffs,by=c("judet_adm_option"="judet_adm",
                              "an_adm"="an_adm",
                              "liceu_repartizat_option"="liceu_repartizat"
                              ),suffix=c("","_option")) 
  
#get feasible options
data_options_feasible<-data_options %>%
  #filter unoption cutoffs
  filter(entrance_perc>=cutoff_option ) %>%
  arrange(an_adm,id_adm,-mean_e_option) %>%
  #group by student
  group_by(an_adm,id_adm) %>% 
  mutate(#
    rank=row_number(),
    n_options=n_distinct(liceu_repartizat_option),
    n_options_tracks=n(),
    score=ifelse(n_options_tracks!=1,(n_options_tracks-rank)/(n_options_tracks-1),1) # find number of relevant options
  ) %>%
  ungroup() %>%
  dplyr::select(id_adm,an_adm,scoala_de_provenienta,liceu_repartizat,liceu_repartizat_option,entrance_perc,
                cutoff,cutoff_option,specializare_adm,specializare_lb,
                dist,dist_option,mean_e,mean_e_option,n_options,n_options_tracks,rank,score,judet_adm) %>%
  mutate(attend=cutoff==cutoff_option)   %>%
  arrange(an_adm,id_adm)

data_options_feasible<-data_options_feasible %>%
  mutate(n_options_group=case_when(
    n_options==1 ~ '1',
    n_options==2 ~ '2',
    n_options==3 ~ '3',
    n_options %in% 4:15 ~ '4-15',
    n_options %in% 16:200 ~ '16+',
    TRUE ~ NA_character_
  ))

data_options_feasible<-data_options_feasible %>%
  mutate(n_options_group_track=case_when(
    n_options_tracks==1 ~ '1',
    n_options_tracks==2 ~ '2',
    n_options_tracks==3 ~ '3',
    n_options_tracks %in% 4:15 ~ '4-15',
    n_options_tracks %in% 16:200 ~ '16+',
    TRUE ~ NA_character_
  ))


data_options_feasible<-data_options_feasible %>%
  mutate(dist=dist/1000,
         dist_option=dist_option/1000,
         mean_e=mean_e*100,
         mean_e_option=mean_e_option*100)

data_options_summary<-data_options_feasible %>%
  #slice(1:1000) %>%
  filter(n_options_group_track!=1) %>%
  group_by(id_adm,entrance_perc,scoala_de_provenienta,n_options_group_track,an_adm,judet_adm,dist,cutoff) %>%
  summarise(
    dist_closest=min(dist_option),
    dist_top=dist_option[rank==1],
    attend_top=attend[rank==1]==T,
    adm_score_closest=max(mean_e_option[dist_option==dist_closest]),
    adm_score_top=mean_e_option[rank==1],
    closest=dist_closest==dist_top)

data_options_summary<-data_options_summary %>%
  ungroup() %>%
  mutate(dist_diff_1_closest=dist_top-dist_closest,
         adm_score_diff_1_closest=adm_score_top-adm_score_closest)



#models
model16<-feols(data=data_options_summary %>% filter(n_options_group_track=='16+'),
               I(attend_top*100)~
                 dist_diff_1_closest+
                 adm_score_diff_1_closest
               ,
               cluster=~judet_adm)
summary(model16)

model3<-feols(data=data_options_summary %>% filter(n_options_group_track=='3'),
              I(attend_top*100)~
                dist_diff_1_closest+
                adm_score_diff_1_closest
              ,
              cluster=~judet_adm)
summary(model3)

model4<-feols(data=data_options_summary %>% filter(n_options_group_track=='4-15'),
              I(attend_top*100)~
                dist_diff_1_closest+
                adm_score_diff_1_closest
              ,
              cluster=~judet_adm)
summary(model4)


model2<-feols(data=data_options_summary %>% filter(n_options_group_track=='2'),
              I(attend_top*100)~
                dist_diff_1_closest+
                adm_score_diff_1_closest
              ,
              cluster=~judet_adm)
summary(model2)

###summary
#top choice vs choices
#how often rank 1?
first_11<-with(data_options_feasible %>% filter(attend==T & n_options_group_track==2),sum(rank==1)/length(rank))
first_12<-with(data_options_feasible %>% filter(attend==T & n_options_group_track==3),sum(rank==1)/length(rank))
first_13<-with(data_options_feasible %>% filter(attend==T & n_options_group_track=='4-15'),sum(rank==1)/length(rank))
first_14<-with(data_options_feasible %>% filter(attend==T & n_options_group_track=='16+'),sum(rank==1)/length(rank))

#how often rank 1-2?
first_21<-with(data_options_feasible %>% filter(attend==T & n_options_group==2),sum(rank %in% c(1,2))/length(rank))
first_22<-with(data_options_feasible %>% filter(attend==T & n_options_group==3),sum(rank %in% c(1,2))/length(rank))
first_23<-with(data_options_feasible %>% filter(attend==T & n_options_group=='4-15'),sum(rank %in% c(1,2))/length(rank))
first_24<-with(data_options_feasible %>% filter(attend==T & n_options_group=='16+'),sum(rank %in% c(1,2))/length(rank))

#how often rank 1-3?
first_31<-with(data_options_feasible %>% filter(attend==T & n_options_group==2),sum(rank %in% c(1,2,3))/length(rank))
first_32<-with(data_options_feasible %>% filter(attend==T & n_options_group==3),sum(rank %in% c(1,2,3))/length(rank))
first_33<-with(data_options_feasible %>% filter(attend==T & n_options_group=='4-15'),sum(rank %in% c(1,2,3))/length(rank))
first_34<-with(data_options_feasible %>% filter(attend==T & n_options_group=='16+'),sum(rank %in% c(1,2,3))/length(rank))


#mean rank of choice?
with(data_options_feasible %>% filter(attend==T & n_options_group_track==2),mean(rank))
with(data_options_feasible %>% filter(attend==T & n_options_group_track==3),mean(rank))
with(data_options_feasible %>% filter(attend==T & n_options_group_track=='4-15'),mean(rank))
with(data_options_feasible %>% filter(attend==T & n_options_group_track=='16+'),mean(rank))

#mean score 0-1?
rank_11<-with(data_options_feasible %>% filter(attend==T & n_options_group_track==2),mean(score))
rank_12<-with(data_options_feasible %>% filter(attend==T & n_options_group_track==3),mean(score))
rank_13<-with(data_options_feasible %>% filter(attend==T & n_options_group_track=='4-15'),mean(score))
rank_14<-with(data_options_feasible %>% filter(attend==T & n_options_group_track=='16+'),mean(score))

#high schools?
n1<-with(data_options_feasible %>% filter(n_options_group_track==2) %>% group_by(id_adm) %>% summarise(n=n()),mean(n))
n2<-with(data_options_feasible %>% filter(n_options_group_track==3)  %>% group_by(id_adm) %>% summarise(n=n()),mean(n))
n3<-with(data_options_feasible %>% filter(n_options_group_track=='4-15')  %>% group_by(id_adm) %>% summarise(n=n()),mean(n))
n4<-with(data_options_feasible %>% filter(n_options_group_track=='16+')  %>% group_by(id_adm) %>% summarise(n=n()),mean(n))

#dist?
d1<-with(data_options_feasible %>% filter(n_options_group_track==2) %>% group_by(id_adm) %>% summarise(n=mean(dist_option)),mean(n))
d2<-with(data_options_feasible %>% filter(n_options_group_track==3)  %>% group_by(id_adm) %>% summarise(n=mean(dist_option)),mean(n))
d3<-with(data_options_feasible %>% filter(n_options_group_track=='4-15')  %>% group_by(id_adm) %>% summarise(n=mean(dist_option)),mean(n))
d4<-with(data_options_feasible %>% filter(n_options_group_track=='16+')  %>% group_by(id_adm) %>% summarise(n=mean(dist_option)),mean(n))

#dist_choice
d1a<-with(data_options_feasible %>% filter(n_options_group_track==2) %>% group_by(id_adm) %>% filter(attend==1) %>% summarise(n=mean(dist_option)),mean(n))
d2a<-with(data_options_feasible %>% filter(n_options_group_track==3)  %>% group_by(id_adm) %>% filter(attend==1) %>% summarise(n=mean(dist_option)),mean(n))
d3a<-with(data_options_feasible %>% filter(n_options_group_track=='4-15')  %>% group_by(id_adm) %>% filter(attend==1) %>% summarise(n=mean(dist_option)),mean(n))
d4a<-with(data_options_feasible %>% filter(n_options_group_track=='16+')  %>% group_by(id_adm) %>% filter(attend==1) %>% summarise(n=mean(dist_option)),mean(n))


###########
obs<-c(model2$nobs,model3$nobs,model4$nobs,model16$nobs)
cor<-c(model2$sq.cor,model3$sq.cor,model4$sq.cor,model16$sq.cor)
first<-c(first_11,first_12,first_13,first_14)
second<-c(first_21,first_22,first_23,first_24)
third<-c(first_31,first_32,first_33,first_34)
rank<-c(rank_11,rank_12,rank_13,rank_14)
n<-c(n1,n2,n3,n4)
d<-c(d1,d2,d3,d4)
da<-c(d1a,d2a,d3a,d4a)





summary2<-rbind(
  formatC(t(obs),format="f",digits=0,big.mark = ","),
  formatC(t(cor),format="f",digits=2,big.mark = ","),
  paste0(formatC(t(first)*100,format="f",digits=0,big.mark = ","),"%"),
  paste0(formatC(t(second)*100,format="f",digits=0,big.mark = ","),"%"),
  paste0(formatC(t(third)*100,format="f",digits=0,big.mark = ","),"%"),
  formatC(t(rank),format="f",digits=2,big.mark = ","),
  formatC(t(n),format="f",digits=1,big.mark = ","),
  formatC(t(d),format="f",digits=1,big.mark = ","),
  formatC(t(da),format="f",digits=1,big.mark = ","))
summary2<-as.data.frame(summary2)
names<-c("Observations","Pseudo R2",
         "Attend Highest Admission Score Track",
         "Attend Top-2 Admission Score Track",
         "Attend Top-3 Admission Score Track",
         "Average Rank of Track Attended (0-low, 1-high)",
         "Mean Number of Tracks in Choice Set","Mean Distance to Track in Choice Set (km)","Mean Distance to Track Attended (km)")
summary2<-cbind(names,summary2)



#######
options(modelsummary_format_numeric_latex='plain')
f_big<-function(x) format(x, big.mark=",", scientific=FALSE, nsmall=3,digits=3)
f <- function(x) format(round(x/1000000,1) , nsmall = 3)


current_path<-rstudioapi::getActiveDocumentContext()$path
setwd(dirname(current_path))
fileConn<-file("02_table_02_track_appendix.txt")
writeLines(print(modelsummary(list("2"=model2,
                                   "3"=model3,
                                   "4-15"=model4,
                                   "16+"=model16
),
estimate="{estimate}{stars}",
statistic = "std.error",
stars=c('^{*}'=0.1,'^{**}'=0.05,'^{***}'=0.01),
gof_map=list(list("raw" = "nobs", "clean" = "N", "fmt" = f_big),
             list("raw" = "r.squared", "clean" = "R$^2$",fmt="%.2f")),
add_rows = summary2,
metrics="R2",
output="latex",
fmt=2,
escape=T)
), fileConn)
close(fileConn)





